HW 02

Author

Nandakumar Kuthalaraja

if (!require("pacman")) 
  install.packages("pacman")
Loading required package: pacman
pacman::p_load(tidyverse, ggforce,palmerpenguins,ggridges, glue, scales, ggthemes, openintro, ggrepel, dsbox, janitor, fs)

# console output: wrap printed results at 85 characters
options(width = 85)

# ggplot2: use the minimal theme with a 14 pt base font for every plot
ggplot2::theme_set(
  ggplot2::theme_minimal(base_size = 14)
)

# knitr: default figure settings applied to every chunk
knitr::opts_chunk$set(
  fig.width  = 7,        # figure width in inches
  fig.asp    = 0.618,    # height / width ratio (golden ratio)
  fig.retina = 3,        # dpi multiplier for retina displays in HTML output
  fig.align  = "center", # centre figures horizontally
  dpi        = 300       # base resolution; higher means sharper images
)



###All responses are in comments within the code****

1 - A new day, a new plot, a new geom

#read from dsbox pkg dataset
#glimpse(edibnb)

# Keep only listings that have both a neighbourhood and a review score
edibnb <- edibnb |>
  drop_na(neighbourhood, review_scores_rating)

# Median review score per neighbourhood, sorted from lowest to highest median
median_rating <- edibnb |>
  group_by(neighbourhood) |>
  summarise(
    median_review_score = median(review_scores_rating, na.rm = TRUE)
  ) |>
  arrange(median_review_score)

#glimpse(median_rating)

# Convert neighbourhood to a factor whose levels follow the median ordering,
# so the ridges are stacked in that order on the y axis
neighborhood_order <- edibnb |>
  mutate(
    neighbourhood = factor(
      neighbourhood,
      levels = median_rating[["neighbourhood"]]
    )
  )

# One density ridge of review scores per neighbourhood
neighborhood_order |>
  ggplot(aes(x = review_scores_rating, y = neighbourhood)) +
  geom_density_ridges(alpha = 0.7) +
  labs(
    title = "Airbnb Review of Edinburgh Neighborhood",
    subtitle = "DensityRidge Plot",
    x = "Review Score (0–100)",
    y = "Neighborhood"
  ) +
  theme(
    legend.position = "none",
    plot.subtitle = element_text(size = 8)
  )
Picking joint bandwidth of 1.23

This plot shows the distribution of Airbnb review scores across different neighborhoods in Edinburgh. Each ridge represents a neighborhood and illustrates how review scores are spread out for listings in that area. The highest-scoring neighborhoods tend to have a “taller” distribution curve, implying a tighter spread of high review scores compared to the bottom few.

2 - Foreign Connected PACs

# Collect every file under data/ whose path matches "Foreign Connected PAC"
list_of_files <- dir_ls(path = "data", regexp = "Foreign Connected PAC")

# Read all files into one tibble. `id = "year"` stores each row's source file
# path in a column named `year`; it is reduced to an integer year below.
pac <- read_csv(list_of_files, id = "year")

# Clean the column names, split country / parent company, and convert the
# year and dollar columns to numbers
cleaned_pac <- pac |>
  clean_names() |>
  separate(
    country_of_origin_parent_company,
    into = c("country_of_origin", "parent_company"),
    sep = "/",
    # keep any further "/" inside parent_company instead of dropping the
    # trailing pieces (with a warning)
    extra = "merge"
  ) |>
  mutate(
    # Take the text after the first "-" in the file path and strip ".csv".
    # NOTE(review): assumes the year range (e.g. "...-2000.csv") holds the
    # only "-" in the path -- confirm against the actual file names.
    yearStr = str_split(year, "-", simplify = TRUE)[, 2],
    clean_year_string = str_remove(yearStr, fixed(".csv")),
    year = as.integer(clean_year_string),
    # parse_number() drops the "$" and any thousands separators;
    # as.numeric() would silently turn a value such as "$1,000" into NA
    repubs = parse_number(repubs),
    dems = parse_number(dems)
  ) |>
  # drop the helper columns and the unused `total` column
  select(-yearStr, -clean_year_string, -total)

#glimpse(cleaned_pac)

# Reshape to one row per PAC / year / party, then replace the short column
# names ("dems", "repubs") with readable party labels
updated_pac_data <- cleaned_pac |>
  pivot_longer(
    cols = c(dems, repubs),
    names_to = "party",
    values_to = "amount"
  ) |>
  mutate(
    party = case_when(
      party == "dems" ~ "Democrat",
      party == "repubs" ~ "Republican"
    )
  )
# Yearly total contributed to each party by UK-connected PACs
uk_pac <- updated_pac_data |>
  filter(country_of_origin == "UK") |>
  group_by(year, party) |>
  summarise(
    total_amount = sum(amount),
    .groups = "drop"
  )

#glimpse(uk_pac)

# One line per party; amounts are shown in millions of dollars
uk_pac |>
  ggplot(aes(x = year, y = total_amount / 1000000, color = party)) +
  geom_line(linewidth = 1.1) +
  scale_color_manual(
    name = "Party",
    values = c("Democrat" = "blue", "Republican" = "red")
  ) +
  scale_y_continuous(
    labels = dollar_format(prefix = "$", suffix = "M")
  ) +
  labs(
    title = "UK-connected PACs' Contributions to US political parties",
    x = "Year",
    y = "Total amount"
  )

# Yearly total contributed to each party by Mexico-connected PACs.
# `.groups = "drop"` returns an ungrouped tibble (matching the UK summary)
# and avoids summarise()'s regrouping message.
Mexico_pac <- updated_pac_data |>
  filter(country_of_origin == "Mexico") |>
  group_by(year, party) |>
  summarise(total_amount = sum(amount), .groups = "drop")

#glimpse(Mexico_pac)

# One line per party; amounts are shown in millions of dollars
ggplot(Mexico_pac, aes(x = year, y = total_amount / 1000000, color = party)) +
  geom_line(linewidth = 1.1) +
  labs(
    y = "Total amount",
    x = "Year",
    title = "Mexico-connected PACs'  Contributions to US political parties"
  ) +
  scale_color_manual(
    name = "Party",
    values = c("Democrat" = "blue", "Republican" = "red")
  ) +
  scale_y_continuous(labels = dollar_format(prefix = "$", suffix = "M"))

Mexico was chosen for the follow-up plot. Unlike the UK, Mexico has traditionally been a small contributor in dollar terms. There also appears to be a sharp increase in contributions to Republicans around 2008–10, followed by a sharp decline in later years.

3 - Median housing prices in the US

# Median house prices: read, give the columns short lower-case names, and
# make sure `date` is a Date
housing <- read_csv("./data/median-housing.csv") |>
  rename(date = DATE, price = MSPUS) |>
  mutate(date = ymd(date))

# Recession periods: read, lower-case the column names, and make sure both
# endpoints are Dates
recessions <- read_csv("./data/recessions.csv") |>
  rename(peak = Peak, trough = Trough) |>
  mutate(across(c(peak, trough), ymd))

# Y-axis ticks every 40,000 from 0 to 400,000
y_breaks <- seq(0, 400000, by = 40000)

# Line plot of the median sales price over time
ggplot(housing, aes(x = date, y = price)) +
  geom_line(color = "royalblue3", linewidth = 1.2) +
  scale_y_continuous(
    labels = scales::comma,
    breaks = y_breaks,
    limits = c(0, 400000)
  ) +
  scale_x_date(
    limits = c(ymd("1963-01-01"), NA),
    date_breaks = "5 years",
    date_labels = "%Y"
  ) +
  labs(
    title = "Median sales price of houses sold in the United States",
    subtitle = "Not seasonally adjusted",
    y = "Dollars",
    x = NULL,
    caption = "Sources: Census; HUD"
  ) +
  # The complete theme must come BEFORE theme(): a complete theme added
  # afterwards replaces every element set in theme(), which is what the
  # trailing theme_minimal(base_size = 14) used to do here.
  theme_minimal(base_size = 14) +
  theme(
    plot.title.position = "plot",
    plot.title = element_text(size = 16, face = "bold", hjust = 0),
    plot.subtitle = element_text(size = 12, hjust = 0),
    plot.caption = element_text(size = 9),
    # remove vertical grid lines
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank()
  )

# `housing` already has columns `date` / `price` and `recessions` already has
# Date columns `peak` / `trough` from the import step, so only the recession
# columns need renaming here.

# First and last date covered by the housing series
housing_range <- range(housing$date)

# Flag recessions that overlap the housing series at all
recessions <- recessions |>
  rename(start = peak, end = trough) |>
  mutate(in_range = end >= housing_range[1] & start <= housing_range[2])

# filter() also drops rows where the flag is NA, which base `[` subsetting
# would have kept as all-NA rows
recessions_in_range <- recessions |>
  filter(in_range)

ggplot(housing, aes(x = date, y = price)) +
  # Shaded band per recession; -Inf / Inf make each band span the full
  # panel height. Drawn first so the price line sits on top.
  geom_rect(
    data = recessions_in_range,
    aes(xmin = start, xmax = end, ymin = -Inf, ymax = Inf),
    inherit.aes = FALSE, fill = "grey", alpha = 0.4
  ) +
  # `linewidth` replaces the deprecated `size` argument for lines
  geom_line(color = "royalblue3", linewidth = 1) +
  scale_y_continuous(
    labels = scales::comma,
    breaks = y_breaks,
    limits = c(0, 400000)
  ) +
  scale_x_date(
    limits = c(ymd("1963-01-01"), NA),
    date_breaks = "5 years",
    date_labels = "%Y"
  ) +
  labs(
    title = "Median sales price of houses sold in the United States",
    subtitle = "Not seasonally adjusted",
    x = NULL,
    y = "Dollars",
    caption = "Shaded areas indicate U.S. recessions\nSources: Census; HUD"
  ) +
  theme_minimal() +
  theme(
    plot.title.position = "plot",
    plot.title = element_text(size = 16, face = "bold", hjust = 0),
    plot.subtitle = element_text(size = 12, hjust = 0),
    plot.caption = element_text(size = 9),
    # remove vertical grid lines
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank()
  )

# Restrict the series to 2019-2020 and label each observation with its
# calendar quarter
housing_subset <- housing |>
  filter(
    date >= as.Date("2019-01-01"),
    date < as.Date("2020-12-31")
  ) |>
  mutate(
    year = year(date),
    # lubridate::quarter() gives 1-4; prefix with "Q" for display
    quarter = paste0("Q", quarter(date)),
    # ordered labels "2019 Q1" ... "2020 Q4" so the x axis is chronological
    year_quarter = factor(
      paste(year, quarter),
      levels = paste(rep(2019:2020, each = 4), paste0("Q", 1:4))
    )
  )

glimpse(housing_subset)
Rows: 8
Columns: 5
$ date         <date> 2019-01-01, 2019-04-01, 2019-07-01, 2019-10-01, 2020-01-01, 2…
$ price        <dbl> 313000, 322500, 318400, 327100, 329000, 322600, 337500, 358700
$ year         <dbl> 2019, 2019, 2019, 2019, 2020, 2020, 2020, 2020
$ quarter      <chr> "Q1", "Q2", "Q3", "Q4", "Q1", "Q2", "Q3", "Q4"
$ year_quarter <fct> 2019 Q1, 2019 Q2, 2019 Q3, 2019 Q4, 2020 Q1, 2020 Q2, 2020 Q3,…
# Quarterly median price for 2019-2020. `group = 1` joins the points into a
# single line even though the x axis is a factor.
ggplot(housing_subset, aes(x = year_quarter, y = price, group = 1)) +
  # `linewidth` replaces the deprecated `size` argument for lines
  geom_line(color = "royalblue3", linewidth = 1.2) +
  # shape 21 is a filled circle: white interior, blue outline
  geom_point(
    shape = 21, fill = "white", color = "royalblue3",
    size = 1, stroke = 1.2
  ) +
  labs(
    title = "Median sales price of new houses in the U.S.",
    subtitle = "Not Seasonally adjusted",
    x = NULL,
    y = "Dollars"
  ) +
  scale_y_continuous(
    limits = c(300000, 360000),
    breaks = seq(300000, 360000, by = 20000),
    labels = scales::comma
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 16, face = "bold", hjust = 0),
    plot.subtitle = element_text(size = 12, hjust = 0),
    # tilt the quarter labels
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

4 - Expect More. Plot More.

# Red used for the rings and the lettering
target_red <- "#CC0000"

# Three concentric discs, largest first, so each smaller disc is painted over
# the one beneath it: red (r = 3), white (r = 2), red (r = 1)
ggplot() +
  geom_circle(
    aes(x0 = 0, y0 = 0, r = 3, fill = target_red),
    color = NA, show.legend = FALSE
  ) +
  geom_circle(
    aes(x0 = 0, y0 = 0, r = 2, fill = "white"),
    color = NA, show.legend = FALSE
  ) +
  geom_circle(
    aes(x0 = 0, y0 = 0, r = 1, fill = target_red),
    color = NA, show.legend = FALSE
  ) +
  # use the fill values as literal colours
  scale_fill_identity() +
  # equal x / y scaling keeps the circles round
  coord_fixed() +
  theme_void() +
  # word mark below the bullseye, with the registered-trademark sign
  annotate(
    "text", x = 0, y = -4, label = "TARGET",
    color = target_red, size = 10, fontface = "bold"
  ) +
  annotate(
    "text", x = 1.6, y = -4.2, label = "\u00AE",
    color = target_red, size = 6
  )

Still struggling to make this plot render with a smaller aspect ratio…

5 - Mirror, mirror on the wall, who’s the ugliest of them all?

# Deliberately "ugly" penguin plot: bill length vs flipper length, with body
# mass mapped to size in the plot-level aesthetics.
ggplot(penguins, aes(x = bill_length_mm, y = flipper_length_mm, size = body_mass_g)) +
  # points coloured by island and shaped by species
  geom_point(aes(color = island, shape = species), alpha = 0.8) +
  # species name drawn at every point, rotated 90 degrees; size is fixed at 6 here
  geom_text(aes(label = species), size = 6, angle = 90, color = "orange3", hjust = 0.5, vjust = -1) +
  # ggforce mark circles labelled by sex, with a red fill and cyan outline
  geom_mark_circle(aes(label = sex), fill = "red2", color = "cyan", size = 1, expand = unit(2, "mm")) +
  # intentionally unhelpful title and axis labels
  labs(
    title = "IMPORTANT PLOT FOR PENGUINS",
    subtitle = "Species vs Body Mass vs Island",
    x = "X-Axis",
    y = "???",
    caption = "Ugliest Ever"
  )